import org.apache.spark.{SparkConf, SparkContext}

object WordCount {

  /** Reads a customers CSV from HDFS and prints:
    *   - the 5 largest families (most frequent last names)
    *   - the 10 most popular first names
    *
    * Uses `def main` instead of `extends App`: the `App` trait's
    * DelayedInit-based initialization interacts badly with Spark closure
    * serialization (top-level vals may be null on executors).
    *
    * @param args optional; args(0) overrides the default input path
    */
  def main(args: Array[String]): Unit = {
    // Allow the input path to be supplied on the command line;
    // fall back to the original hard-coded location.
    val inputPath = args.headOption
      .getOrElse("hdfs://172.20.0.2:9000/user/root/day8lx/customers.csv")

    // Create the SparkConf / SparkContext.
    val sparkConf = new SparkConf()
    sparkConf.setAppName("spark-word-count-on-yarn")
    val sc = SparkContext.getOrCreate(sparkConf)

    try {
      // RDD reading the raw CSV lines from HDFS.
      val hdfsRdd = sc.textFile(inputPath)

      // Extract (firstName, lastName) per row. `collect` with a partial
      // function skips malformed rows (fewer than 3 columns, e.g. a header
      // or blank line), where the original destructuring
      // `val Array(_, first, last, _*)` would throw a MatchError and
      // fail the whole job. Cached because two actions consume it below;
      // without caching the file would be read and parsed twice.
      val namesRdd = hdfsRdd
        .map(_.split(","))
        .collect { case Array(_, firstName, lastName, _*) => (firstName, lastName) }
        .cache()

      // Top 5 largest families among the customers (by last name).
      namesRdd
        .map { case (_, lastName) => (lastName, 1) }
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(5)
        .foreach(println)

      // Top 10 most popular first names among the customers.
      namesRdd
        .map { case (firstName, _) => (firstName, 1) }
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(10)
        .foreach(println)
    } finally {
      // Release the YARN application's resources even if the job fails.
      sc.stop()
    }
  }
}
